import requests
import scipy.stats
import pandas as pd
import numpy as np
import json
from pandas import read_csv
from statsmodels.graphics.tsaplots import plot_acf
from pandas import datetime
from pandas import DataFrame
from statsmodels.tsa.arima_model import ARIMA
from matplotlib import pyplot
from datetime import datetime
from pmdarima.arima import auto_arima
import seaborn as sns
import warnings
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
warnings.filterwarnings("ignore")
# Remote challenge API endpoint and the account used for the daily data pull.
URL = 'http://167.172.183.67'
# NOTE(review): credentials are hard-coded in source — consider moving them to
# environment variables or a config file kept out of version control.
USERNAME = "Group10"
PASSWORD = "q7NQ4hVoe7EUfxHV"
# Presumably gates submission of forecasts back to the server; the flag is not
# read anywhere in the visible part of the script — TODO confirm.
submit_now = False
def get_token(username, password):
    """Authenticate against the challenge API and return the token string.

    POSTs the credentials to ``{URL}/token/`` and extracts the ``"key"``
    field from the JSON response.
    """
    credentials = {"username": username, "password": password}
    response = requests.post(f"{URL}/token/", data=credentials)
    return response.json()["key"]
def get_data(token, start_date='2020-03-20'):
    """Download the dataset from the API as a DataFrame sorted by event_date.

    Parameters
    ----------
    token : str
        Auth token obtained from ``get_token``.
    start_date : str
        Earliest date to fetch, formatted YEAR-MONTH-DAY.
    """
    auth_header = {'Authorization': f'Token {token}'}
    response = requests.get(f'{URL}/dataset/',
                            params={'start_date': start_date},
                            headers=auth_header)
    frame = pd.DataFrame.from_dict(response.json())
    frame["event_date"] = pd.to_datetime(frame["event_date"])
    return frame.sort_values(by=["event_date"])
# Obtain the auth token once at import time; reused by every API call below.
token = get_token(USERNAME,PASSWORD)
def get_new_data(token):
    """Download the latest dataset and merge unseen dates into the cached history.

    NOTE(review): the merged frame ``current_data`` is built by the loop below
    but is neither saved nor returned — the function returns the freshly
    downloaded ``new_data`` only, so the merge with "challenge_unchanged.csv"
    currently has no observable effect. Confirm whether ``current_data``
    should be returned and/or written back to disk.
    """
    new_data = get_data(token)
    new_unique_dates = new_data["event_date"].unique()
    # Normalise product ids to plain ints (they may arrive as strings/objects).
    new_data["product_content_id"] = new_data["product_content_id"].apply(lambda x: int(x))
    # Locally cached history from a previous run.
    current_data = pd.read_csv("challenge_unchanged.csv")
    current_data["event_date"] = current_data["event_date"].apply(lambda x: datetime.strptime(x,"%Y-%m-%d"))
    current_unique_dates = current_data["event_date"].unique()
    # Prepend rows for any date not yet present in the cache.
    for date in new_unique_dates:
        if(date not in current_unique_dates):
            temp = new_data[new_data["event_date"]==date]
            current_data =pd.concat([temp,current_data],ignore_index=True)
    return new_data
# Pull the data once at import time and build the groupings used throughout.
data = get_new_data(token)
product_grouped = data.groupby("product_content_id")
date_grouped = data.groupby("event_date")
unique_products = data["product_content_id"].unique()
unique_dates = data["event_date"].unique()
# Hard-coded product_content_ids for the eight tracked products. The Turkish
# short names presumably mean: dis=tooth(care), yuz=face(cleanser), islak=wet
# (wipes), kulaklik=headphones, supurge=vacuum, tayt=tights, bikini=bikini,
# mont=coat — verify against the challenge description.
dis = 32939029
yuz = 85004
islak = 4066298
kulaklik = 6676673
supurge = 7061886
tayt = 31515569
bikini = 5926527
mont = 3904356
# Parallel lists: prods[i] corresponds to prod_names[i].
prods = [kulaklik,yuz,islak, supurge, mont, bikini, tayt, dis]
prod_names = ['kulaklik','yuz','islak', 'supurge', 'mont', 'bikini', 'tayt', 'dis']
def func(row):
    """Map a product_content_id to its short product name.

    Returns the entry of ``prod_names`` paired with ``row`` in ``prods``,
    or ``None`` for an unknown id. (The original kept scanning after a match
    and raised UnboundLocalError — ``val`` was never bound — when the id was
    not in ``prods``.)
    """
    for pid, pname in zip(prods, prod_names):
        if row == pid:
            return pname  # stop at the first match
    return None  # unknown product id; pandas .apply maps this to NaN-like
# Attach human-readable product names and dump today's raw pull to disk.
data["names"] = data["product_content_id"].apply(func)
day = datetime.today()
# NOTE(review): the filename hard-codes "may" regardless of the actual month —
# confirm whether day.strftime should be used instead.
data.to_csv("daily_data/"+str(day.day) + "_may_data.csv")
# Conversion-rate features (division by a zero count yields inf).
data["visit_conversion_rate"] = data["sold_count"]/ data["visit_count"]
data["favored_conversion_rate"] = data["sold_count"]/ data["favored_count"]
data["basket_conversion_rate"] = data["sold_count"]/ data["basket_count"]
# Sort newest date first, then by product name, so each day occupies a run of
# 8 consecutive rows (one per product). Shifting by -16 rows therefore looks
# 2 days back for the same product, and -24 looks 3 days back — this assumes
# exactly 8 products are present on every date; verify.
data =data.sort_values(["event_date","names"],ascending = False).reset_index().drop("index",axis = 1)
data["twodays_lagged_favored"] = data["favored_count"].shift(-16)
data["twodays_lagged_basket"] = data["basket_count"].shift(-16)
data["twodays_lagged_visit"] = data["visit_count"].shift(-16)
data["twodays_lagged_category_visits"] = data["category_visits"].shift(-16)
data["threedays_lagged_favored"] = data["favored_count"].shift(-24)
data["threedays_lagged_basket"] = data["basket_count"].shift(-24)
names_grouped = data.groupby("names")
unique_names = data["names"].unique()
# --- Sales vs time -------------------------------------------------------
# Plot window: March 1 through today.
init_date = [2020,3,1]
last_date = [2020,5,datetime.today().day]
import plotly.graph_objs as go
# One line trace per product. (Removed the dead `fig = go.Figure()` that was
# created here and immediately discarded by the rebuild below.)
traces = []
for products, k in data.groupby("names"):
    traces.append(go.Scatter(x=k.event_date, y=k.sold_count, name=products, mode='lines'))
fig = go.Figure(data=traces)
fig.update_layout(xaxis_range=[datetime(init_date[0], init_date[1], init_date[2]),
                               datetime(last_date[0], last_date[1], last_date[2])],
                  yaxis_range=[0, 2000], title="Sales vs Time")
fig.show()
plot(fig, filename="plots/sales_vs_time.html")
# --- Visit conversion rate vs time ---------------------------------------
# Plot window: March 1 through today.
init_date = [2020,3,1]
last_date = [2020,5,datetime.today().day]
import plotly.graph_objs as go
# One line trace per product. (Removed the dead `fig = go.Figure()` that was
# created here and immediately discarded by the rebuild below.)
traces = []
for products, k in data.groupby("names"):
    traces.append(go.Scatter(x=k.event_date, y=k.visit_conversion_rate, name=products, mode='lines'))
fig = go.Figure(data=traces)
fig.update_layout(xaxis_range=[datetime(init_date[0], init_date[1], init_date[2]),
                               datetime(last_date[0], last_date[1], last_date[2])],
                  yaxis_range=[0, 0.2], title="Visit_Conversion_Rate")
fig.show()
plot(fig, filename="plots/Visit_Conversion_Rate.html")
# --- Visit counts vs time ------------------------------------------------
# Plot window: April 1 through today.
init_date = [2020,4,1]
last_date = [2020,5,datetime.today().day]
import plotly.graph_objs as go
# One line trace per product. (Removed the dead `fig = go.Figure()` that was
# created here and immediately discarded by the rebuild below.)
traces = []
for products, k in data.groupby("names"):
    traces.append(go.Scatter(x=k.event_date, y=k.visit_count, name=products, mode='lines'))
fig = go.Figure(data=traces)
fig.update_layout(xaxis_range=[datetime(init_date[0], init_date[1], init_date[2]),
                               datetime(last_date[0], last_date[1], last_date[2])],
                  yaxis_range=[0, 50000],
                  title="Visit vs Time")
fig.show()
plot(fig, filename="plots/Visit_vs_Time.html")
# --- Category visits vs time ---------------------------------------------
# Plot window: March 1 through June 3.
init_date = [2020,3,1]
last_date = [2020,6,3]
import plotly.graph_objs as go
# One line trace per product. (Removed the dead `fig = go.Figure()` that was
# created here and immediately discarded by the rebuild below; also fixed the
# chart title typo "Categorty_Visits" -> "Category_Visits".)
traces = []
for products, k in data.groupby("names"):
    traces.append(go.Scatter(x=k.event_date, y=k.category_visits, name=products, mode='lines'))
fig = go.Figure(data=traces)
fig.update_layout(xaxis_range=[datetime(init_date[0], init_date[1], init_date[2]),
                               datetime(last_date[0], last_date[1], last_date[2])],
                  title="Category_Visits")
fig.show()
# Autocorrelation of bikini sales at lags 0..7 (weekly seasonality check).
plot_acf(data.groupby("names").get_group('bikini')['sold_count'],lags = 7)
def rolling_analys(name,data = data,size=30,window=7):
    """Plot rolling-mean/std stationarity diagnostics for a sold_count series.

    Draws three stacked panels: the raw series with its rolling mean/std, the
    z-scored (de-trended) series, and the `window`-lag difference of the
    z-scores.

    NOTE(review): the default ``data=data`` captures the module-level frame at
    definition time; the function also writes 'z_data'/'zp_data' columns into
    the frame it is given (mutates the caller's data) — confirm intended.
    ``size`` is unused except in the commented-out slicing below.
    """
    #temp = data.iloc[0:8*size]
    temp1 = data
    #temp1 = temp[temp["names" ] == name]
    #temp1.reset_index(inplace = True)
    # z-score against the rolling window: (x - rolling mean) / rolling std.
    temp1['z_data'] = (temp1['sold_count'] - temp1.sold_count.rolling(window=window).mean()) / temp1.sold_count.rolling(window=window).std()
    # Seasonal differencing of the z-scores at lag `window`.
    temp1['zp_data'] = temp1['z_data'] - temp1['z_data'].shift(window)
    fig, ax = pyplot.subplots(3,figsize=(12, 9))
    # Panel 1: raw series with rolling statistics.
    ax[0].set_title(name + ' window: ' + str(window) + " raw rolling")
    ax[0].plot(temp1.index, temp1.sold_count, label='raw data')
    ax[0].plot(temp1.sold_count.rolling(window=window).mean(), label="rolling mean");
    ax[0].plot(temp1.sold_count.rolling(window=window).std(), label="rolling std ");
    ax[0].legend()
    # Panel 2: de-trended (z-scored) series with rolling statistics.
    ax[1].set_title(name + ' window: ' + str(window) + " de-trended rolling")
    ax[1].plot(temp1.index, temp1.z_data, label="de-trended data")
    ax[1].plot(temp1.z_data.rolling(window=window).mean(), label="rolling mean");
    ax[1].plot(temp1.z_data.rolling(window=window).std(), label="rolling std ");
    ax[1].legend()
    # Panel 3: lag-differenced de-trended series with rolling statistics.
    ax[2].set_title(name + ' window: ' + str(window) + " lag differenced de-trended")
    ax[2].plot(temp1.index, temp1.zp_data, label= str(window) + " lag differenced de-trended data")
    ax[2].plot(temp1.zp_data.rolling(window=window).mean(), label="rolling mean");
    ax[2].plot(temp1.zp_data.rolling(window=window).std(), label="rolling std ");
    ax[2].legend()
    #return {"z_mean" :temp1.z_data.rolling(window=window).mean().mean(),"z_std": temp1.z_data.rolling(window=window).std()}
# Rolling diagnostics on the most recent 60 days of headphone (kulaklik) sales.
to_roll = (names_grouped.get_group("kulaklik")
           .sort_values("event_date", ascending=False)
           .reset_index(drop=True))
rolling_analys("kulaklik", to_roll.iloc[0:60], window=7)
# Per-product frames: the latest 30 rows of price/sales/lagged engagement columns.
to_plot = [names_grouped.get_group(name).iloc[0:30][[
               "price","sold_count","twodays_lagged_favored","visit_count",
               "twodays_lagged_basket","twodays_lagged_visit"]]
           for name in unique_names]
# One regression pair-plot per product: sold_count against each candidate feature.
for i in range(8):
    g = sns.pairplot(data=to_plot[i],
                     kind="reg",
                     y_vars="sold_count",
                     x_vars=["price","twodays_lagged_favored","twodays_lagged_basket",
                             "visit_count","twodays_lagged_visit"])
    g.fig.suptitle(unique_names[i], y=1.08)
# Per-product frames: the latest 15 rows of category-level columns.
to_plot = [names_grouped.get_group(name).iloc[0:15][[
               "category_sold", "sold_count", "category_brand_sold",
               "category_visits", "ty_visits", "twodays_lagged_category_visits"]]
           for name in unique_names]
# One regression pair-plot per product: sold_count against each category feature.
for i in range(8):
    g = sns.pairplot(data=to_plot[i],
                     kind="reg",
                     y_vars="sold_count",
                     x_vars=["category_sold", "category_brand_sold",
                             "category_visits", "ty_visits",
                             "twodays_lagged_category_visits"])
    g.fig.suptitle(unique_names[i], y=1.08)
# Pearson correlation of every column with sold_count, per product, over each
# product's latest 60 rows.
corr_coefs = pd.DataFrame(index=unique_names, columns=data.columns)
p_values = pd.DataFrame(index=unique_names, columns=data.columns)
for name in unique_names:
    temp = names_grouped.get_group(name).iloc[0:60]["sold_count"]
    for column in data.columns:
        try:
            temp2 = names_grouped.get_group(name).iloc[0:60][column]
            # pearsonr returns both the coefficient and its p-value in one
            # call; the original computed the coefficient a second time via
            # np.corrcoef.
            r, p = scipy.stats.pearsonr(temp, temp2)
            corr_coefs.at[name, column] = r
            p_values.at[name, column] = p
        except (TypeError, ValueError):
            # Non-numeric columns (dates, product names) cannot be
            # correlated; skip them. The original bare `except:` also
            # swallowed KeyboardInterrupt/SystemExit.
            continue
# sold_count's correlation with itself and the non-feature columns are not
# informative — drop them from the report.
corr_coefs.drop(["event_date","product_content_id","sold_count","names"],inplace = True, axis = 1)
# Per product, keep the columns that are both statistically significant
# (p < 0.1) and materially correlated (|r| > 0.2) with sold_count.
significants = {}
for name in unique_names:
    significant = []
    for column in corr_coefs.columns:
        if p_values.at[name, column] < 0.1:
            if abs(corr_coefs.at[name, column]) > 0.2:
                significant.append(column)
    significants[name] = significant
corr_coefs